### checking correlation covariates + outcomes

# Packages ----------------------------------------------------------------
pacman::p_load(tidyverse, scales,  knitr, srvyr, extrafont, 
               rebus, broom, tidyr, lubridate, here, ggdist, ggtext, ggalt, janitor, 
               broom, tidyr,  stargazer, janitor, estimatr, list)

pacman::p_load(here, DeclareDesign, tidyverse, fabricatr, wesanderson, tidyr, patchwork)

# run code with the modelling functions
source(here("code","_utils.R"))

# Data wrangling ----------------------------------------------------------

# open dataset
d <- readRDS(here("data","all_processed_data.rds"))

# select variables for control
add.vars = c(           
    #demographics
    "w1_q_gender" ,  
    "q_race" ,  
    "w1_q_education" ,
    "w1_q_age_num",
    "income_num",
    
    
    # trust  
    "as.numeric(w1_q_trust_government)" ,
    "as.numeric(w1_q_trust_congress)" ,
    "as.numeric(w1_q_trust_supreme_court)" ,
    "as.numeric(w1_q_trust_electoral_authorities)" ,
    "as.numeric(w1_q_trust_globo)" ,
    "as.numeric(w1_q_trust_news_channels)" ,
    "w1_q_legitimacy" ,
    
    # polarization    
    "w1_affective_pol_bolsonaro" ,
    " w1_affective_pol_lula" ,
    
    # ideology  
    "w1_ideology_you" ,  
    "w1_q_politics_num" ,   
    
    # social media usage
    "w1_q_whatsapp_num ",
    "as.numeric(w1_q_fake_news)" ,
    "as.numeric(w1_q_whatsapp_purposes_news)" ,
    "as.numeric(w1_q_whatsapp_purposes_pay_bills)" ,
    "as.numeric(w1_q_whatsapp_purposes_family)" ,
    "as.numeric(w1_q_whatsapp_purposes_groups)"
)


# extract the joint correlation betwen these variables and the outcomes
depvar = c("false_false_sum","true_true_sum", "false_news_exp", "true_news_e", "pol_all_zcore", 
           "swb_zcore")
out = list()
for (outcome in depvar){
    
    print(outcome)
    model.expr = as.formula(paste(outcome, " ~ ",
                                  paste(add.vars, collapse = "+")))
    
    out[[outcome]] = lm_robust(model.expr, data = d, se_type="HC0")$r.squared
    
}

# get the correlation between outcome ~ X
corr <- out %>% 
    as_tibble() %>% 
    pivot_longer(cols = everything()) %>%
    mutate(corr=sqrt(value))

# Extract the mean effects and Standard deviation of the outcomes
results = list()
for (outcome in depvar){
    
    # data    
    summ = d %>% 
        select(outcome, exp) %>%
        group_by(exp) %>% 
        summarise(sd=sd(.data[[outcome]], na.rm=TRUE),
                  m=mean(.data[[outcome]], na.rm=TRUE))
    
    # Store the result in the list
    results[[outcome]] <- summ
    
}


## Step 1: Define the Model ------------------------------------------------------------------
# Set parameters for you population
population <- declare_population(N = N, # sample size
                                 draw_multivariate(c(X, u_0) ~ mvrnorm(
                                     n = N,
                                     mu = c(control_mean, control_mean),
                                     Sigma = matrix(c(1, rho2, rho2 , 1), 2, 2)
                                 )), # error term for the control, and some X we observe with error
                                 u_1 = rnorm(n = N, mean = rho * u_0, sd = sqrt(1 - rho^2))) # error term treatment groups
# notice here the errror terms are correlated allowing us also to model 
# the contributtions of covariates for the precision of the treatment effects.

# Define potential outcomes
potential_outcomes <- declare_potential_outcomes(Y ~ (1 - Z) * (u_0 * control_sd + control_mean) + Z * (u_1 * treatment_sd + 
                                                                                                            treatment_mean))


# Step 2: Define Inquiry --------------------------------------------------
## Define which estimand you are looking for. 
estimand <- declare_inquiry(ATE = mean(Y_Z_1 - Y_Z_0))


# Step 3: Data strategy ---------------------------------------------------
# declare your research design interventions. 
# here we are working with a complete randomization assignment. 

assignment <- declare_assignment(Z = complete_ra(N, prob = assignment_prob))
reveal_Y <- declare_reveal()


# Step 4: Declare your estimator ------------------------------------------
estimator_no_cov <- declare_estimator(Y ~ Z, method=lm, inquiry = estimand, label="no_covariate" )
estimator_cov <- declare_estimator(Y ~ Z + X, metho=lm, inquiry = estimand, label="with_covariate" )

# Write a function with the design ------------------------------------------------------
# Functon to combine all the steps from the declared design

two_arm_design <- function(
        N, # sample size
        assignment_prob, # share in the treatment
        control_mean, # mean in the control group
        control_sd, # sd in the control group
        treatment_mean, #mean in the treatment group
        treatment_sd, # sd in the treatment
        rho, # correlation between error terms 
        rho2) # correlation between covariates and error term)
{
    # start the declare design here
    # basically repeating all the steps from above
    model <- declare_population(N = N, 
                                draw_multivariate(c(X, u_0) ~ MASS::mvrnorm(
                                    n = N,
                                    mu = c(control_mean, control_mean),
                                    Sigma = matrix(c(1, rho2, rho2 , 1), 2, 2)
                                )), 
                                u_1 = rnorm(n = N, mean = rho * u_0, sd = sqrt(1 - rho^2))) + 
        declare_potential_outcomes(Y ~ (1 - Z) * (u_0 * control_sd + control_mean) + Z * (u_1 * treatment_sd + 
                                                                                              treatment_mean)) +
        declare_inquiry(ATE = mean(Y_Z_1 - Y_Z_0)) +
        declare_assignment(Z = complete_ra(N, prob = assignment_prob)) +
        declare_reveal() +
        declare_estimator(Y ~ Z, 
                          .method = lm_robust,
                          .summary = tidy, 
                          label="no_covariate" ) +
        declare_estimator(Y ~ Z + X,
                          .method = lm_robust,
                          .summary = tidy, 
                          label="with_covariate" )
    
    
}


## Post-hoc Power Analysis  ------------
### Get mean value of the outcome in the control group and the SD of the outcome in the control group.
### Assign these values in the power calculation
### Use these values to calculate the mean in the treatment based on a .2 d-cohen effect, and use the sd of the treatment group. 
### use the correlation of the covariate model. 
### this will give me the power for each test
set.seed(1310)

power <- map2(out, results, ~ {
    
    # Extract means and SDs
    control_mean <- .y$m[1]
    treatment_mean <- .y$m[1] + (0.2 * .y$sd[1])
    control_sd <- .y$sd[1]
    treatment_sd <- .y$sd[1]
    corr <- sqrt(.x)
    
    # Create the design
    design <- two_arm_design(
        N = 737,
        assignment_prob = 0.5,
        control_mean = control_mean,
        control_sd = control_sd,
        treatment_mean = treatment_mean,
        treatment_sd = treatment_sd,
        rho = 1,
        rho2 = corr #
    )
    
    # Diagnose the design
    diagnose_design(design, sims = 1000) 
    
})



# bind rows
power_df <- map2(power,list("False News \n  Accuracy", "True News \n Accuracy", 
                            "False News \n Exposure", "True News \n Exposure", 
                            "Polarization", "Subjective \n Well-Being"), ~  
                     .x$diagnosands_df %>% mutate(outcomes=.y) %>%
                     select(1:3, outcomes, contains("power")))  %>%
    bind_rows() %>%
    mutate(estimator=ifelse(estimator=="no_covariate", "ITT", " COV-ITT"), 
           outcomes=fct_rev(fct_inorder(outcomes))
    ) %>% clean_names()


# graph
ggplot(power_df, 
       aes(y=power,
           x=outcomes,
           fill=estimator)) +
    geom_col(color="black", width=.8, alpha=.2, 
             position=position_dodge(width = .8)) +
    geom_errorbar(aes(x=outcomes, y=power, ymin=power-1.96*se_power, 
                      ymax=power+1.96*se_power, color=estimator), width=.4, 
                  position=position_dodge(width = .8), color="black", alpha=.7) +
    geom_point(aes(x=outcomes, y=power, color=estimator), width=.4, 
               position=position_dodge(width = .8), color="black", alpha=.7)  +
    geom_hline(aes(yintercept=.8), linetype="dashed",  color="tomato2") +
    coord_flip() +
    scale_fill_manual(name="Estimator:", values = c("grey90", "gray10")) +
    scale_color_manual(values = c("grey90", "gray10")) +
    scale_y_continuous(breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1),
                       limits=c(0, 1)) +
    labs(title="",
         x="",
         y="Statistical Power" ,
         caption = "Bootstrapped Confidence Intervals with 1000 Simulation") +
    theme(axis.title.x = element_text(hjust=1), 
          legend.position = "bottom")

save_function("sm_fig14.png")


# Pre-Registered Power Analysis -------------------------------------------
## Here I am again using DeclareDesign to perform Power Analysis. This analysis has been pre-registered

## Step 1: Define the Model ------------------------------------------------------------------
# Set parameters for you population
population <- declare_population(N = N, # sample size
                                 draw_multivariate(c(X, u_0) ~ mvrnorm(
                                     n = N,
                                     mu = c(control_mean, control_mean),
                                     Sigma = matrix(c(1, rho2, rho2 , 1), 2, 2)
                                 )), # error term for the control, and some X we observe with error
                                 u_1 = rnorm(n = N, mean = rho * u_0, sd = sqrt(1 - rho^2))) # error term treatment groups
# notice here the errror terms are correlated allowing us also to model 
# the contributtions of covariates for the precision of the treatment effects.

# Define potential outcomes
potential_outcomes <- declare_potential_outcomes(Y ~ (1 - Z) * (u_0 * control_sd + control_mean) + Z * (u_1 * treatment_sd + 
                                                                                                            treatment_mean))


# Step 2: Define Inquiry --------------------------------------------------
## Define which estimand you are looking for. 
estimand <- declare_inquiry(ATE = mean(Y_Z_1 - Y_Z_0))


# Step 3: Data strategy ---------------------------------------------------
# declare your research design interventions. 
# here we are working with a complete randomization assignment. 

assignment <- declare_assignment(Z = complete_ra(N, prob = assignment_prob))
reveal_Y <- declare_reveal()


# Step 4: Declare your estimator ------------------------------------------
estimator_no_cov <- declare_estimator(Y ~ Z, method=lm, inquiry = estimand, label="no_covariate" )
estimator_cov <- declare_estimator(Y ~ Z + X, metho=lm, inquiry = estimand, label="with_covariate" )

# Write a function with the design ------------------------------------------------------

# Functon to combine all the steps from the declared design

two_arm_design <- function(
        N, # sample size
        assignment_prob, # share in the treatment
        control_mean, # mean in the control group
        control_sd, # sd in the control group
        treatment_mean, #mean in the treatment group
        treatment_sd, # sd in the treatment
        rho, # correlation between error terms 
        rho2) # correlation between covariates and error term)
{
    # start the declare design here
    # basically repeating all the steps from above
    model <- declare_population(N = N, 
                                draw_multivariate(c(X, u_0) ~ MASS::mvrnorm(
                                    n = N,
                                    mu = c(control_mean, control_mean),
                                    Sigma = matrix(c(1, rho2, rho2 , 1), 2, 2)
                                )), 
                                u_1 = rnorm(n = N, mean = rho * u_0, sd = sqrt(1 - rho^2))) + 
        declare_potential_outcomes(Y ~ (1 - Z) * (u_0 * control_sd + control_mean) + Z * (u_1 * treatment_sd + 
                                                                                              treatment_mean)) +
        declare_inquiry(ATE = mean(Y_Z_1 - Y_Z_0)) +
        declare_assignment(Z = complete_ra(N, prob = assignment_prob)) +
        declare_reveal() +
        declare_estimator(Y ~ Z, 
                          .method = lm_robust,
                          .summary = tidy, 
                          inquiry = estimand, 
                          label="no_covariate" ) +
        declare_estimator(Y ~ Z + X,
                          .method = lm_robust,
                          .summary = tidy, 
                          inquiry = estimand, 
                          label="with_covariate" )
    
    
}


# Simulate results ---------------------------------------------------
set.seed(1310)

# Grid with sample size and effect size
grid <- expand_grid(n=seq(100, 3000, by =100),
                    eff=seq(0.05, 0.5, by =0.05))

# Define quantities I want to analyze in the simulations
diagnosands <- declare_diagnosands(
    bias = mean(estimate - estimand),
    rmse = sqrt(mean((estimate - estimand)^2)),
    power = mean(p.value <= 0.05)
)

# run the simulations (row-wise operations)
grid <- grid %>% 
    mutate(model=map2(n, eff, ~ two_arm_design(N = .x, 
                                               assignment_prob = 0.5,
                                               control_mean = 0,
                                               control_sd = 1,
                                               treatment_mean = .y,
                                               treatment_sd = 1,
                                               rho = 1,
                                               rho2 = .2)), # declare the mode
           diagnosis=map(model, ~ diagnose_design(.x, 
                                                  diagnosands=diagnosands,
                                                  sims=1000,
                                                  bootstrap_sims = 200)), # run simulation 
           res=map(diagnosis, "diagnosands_df")) # get results


# Unnest to a single dataframe
res <- grid %>% 
    unnest(res) %>%
    ungroup() %>%
    dplyr::select(-model, -design)

# Cleaning the variables
res<- res %>%
    mutate(N_fct=as.factor(n),
           eff=as.factor(eff), 
           power=ifelse(power>0.79, "Power > 80%", "Power < 80%")) %>%
    mutate(estimator=str_to_title(str_replace_all(estimator, "_", " "))) %>%
    dplyr::select(N_fct, eff, estimator, power)


# Simulate many results ---------------------------------------------------
pal <- wes_palette("Zissou1", n=5)

# no covariate adjustment
ggplot(res %>% 
           filter(estimator=="No Covariate"), 
       aes(x=N_fct,y=eff, 
           fill=fct_rev(power)))+
    geom_tile(colour="gray95",size=0.5, alpha=.8)  +
    guides(fill=guide_legend(title="Power Results"))+ 
    labs(x="Number of Observations",
         y="") +
    scale_fill_manual(values=c(pal[1], pal[5]))  +
    facet_grid(~ estimator) +
    theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
          strip.text = element_text(family = my_font, color = "#22211d",
                                    size = 14, face="italic"), 
          plot.caption = element_text(size=10)) 

ggsave(here("output", "sm_fig13_left.png"),  width = 14, height = 8, units = "in", pointsize = 12, bg = "white")


# with covariate adjustment

ggplot(res %>% 
           filter(estimator=="With Covariate"), 
       aes(x=N_fct,y=eff, 
           fill=fct_rev(power)))+
    geom_tile(colour="gray95",size=0.5, alpha=.8)  +
    guides(fill=guide_legend(title="Power Results"))+ 
    labs(x="Number of Observations",
         y="") +
    scale_fill_manual(values=c(pal[1], pal[5]))  +
    facet_grid(~ estimator) +
    theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
          strip.text = element_text(family = my_font, color = "#22211d",
                                    size = 14, face="italic"), 
          plot.caption = element_text(size=22)) 

ggsave(here("output", "sm_fig13_right.png"),  width = 14, height = 8, units = "in", pointsize = 12, bg = "white")




